# coding:utf-8

from scrapy.contrib.spiders import CrawlSpider
from ..items import JiwuItem
import bs4
import scrapy


class JiwuSpider(CrawlSpider):
    """Spider for jiwu.com (吉屋网).

    Crawl flow: index page -> one request per city -> one request per
    area (district) within the city -> one JiwuItem per estate listing,
    following pagination within each area.
    """

    name = 'jiwuspider'
    allowed_domains = ['jiwu.com']
    start_urls = ['http://www.jiwu.com/index.html']

    def parse(self, response):
        """Extract every city link from the index page and schedule each
        city's estate-list page for :meth:`parse_city_area`.
        """
        soup = bs4.BeautifulSoup(response.body, 'lxml')

        # All city anchors on the landing page.
        for child in soup.select('.section-four-a2 > li > a'):
            href = child.get('href')
            city = child.get_text()
            city_url = href + '/loupan/'
            # The city id is the subdomain, e.g. http://ay.jiwu.com -> "ay".
            city_id = href.replace('http://', '').replace('.jiwu.com', '')
            # A fresh dict is built per iteration, so each Request owns
            # its own meta and later iterations cannot clobber it.
            meta = {
                'website': '吉屋', 'web_url': 'jiwu.com',
                'city': city, 'city_id': city_id, 'city_url': city_url
            }
            yield scrapy.Request(city_url, callback=self.parse_city_area, meta=meta)

    def parse_city_area(self, response):
        """For one city, extract its areas (districts) and schedule each
        area's estate listing page for :meth:`parse_city_estate`.
        """
        soup = bs4.BeautifulSoup(response.body, 'lxml')

        # Guard against layout changes / empty pages: the original
        # indexed [0] unconditionally, which raises IndexError when the
        # selector matches nothing.
        boxes = soup.select('.lp-pb-s3')
        if not boxes:
            return
        for child in boxes[0].select('li > a '):
            area = child.get_text().split('(')[0]
            area_url = child.get('href')
            if area != '全部':
                # BUGFIX: copy the meta dict per request. scrapy.Request
                # keeps a *reference* to the dict it is given; mutating
                # one shared response.meta (as the original did) made
                # every scheduled request see the last area assigned.
                meta = dict(response.meta, area=area)
                yield scrapy.Request(area_url, callback=self.parse_city_estate, meta=meta)

    def parse_city_estate(self, response):
        """Yield one JiwuItem per estate anchor on the page, then follow
        the "next page" link (if any) back into this callback.
        """
        meta = response.meta
        city_url = meta['city_url']
        soup = bs4.BeautifulSoup(response.body, 'lxml')

        for child in soup.find_all('a', class_='loupan-list1-s0'):
            estate_url = child.get('href')
            item = JiwuItem()
            item['website'] = meta['website']
            item['web_url'] = meta['web_url']
            item['city'] = meta['city']
            item['city_id'] = meta['city_id']
            item['area'] = meta['area']
            item['estate'] = child.get_text()
            # e.g. http://xx.jiwu.com/loupan/123.html -> "123"
            item['estate_id'] = estate_url.replace(city_url, '').replace('.html', '')
            item['estate_url'] = estate_url
            yield item

        # Pagination: urljoin is a no-op for absolute hrefs and resolves
        # relative ones against the current page, so either form works.
        next_page = soup.select('.tg-rownum-next')
        if next_page:
            next_url = next_page[0].get('href')
            if next_url:
                yield scrapy.Request(response.urljoin(next_url),
                                     callback=self.parse_city_estate, meta=meta)
